載入packages —————————————————————

library("Seurat")
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.3.0 but the current version is
## 4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("magrittr")

R Markdown

setwd("/Users/benson/Desktop/Transcriptome")
data.dir <- file.path("/Users/benson/Desktop/Transcriptome/final")

將單一樣本資料前處理,寫成一個function ————————————-

(1)讀檔 (matrix.mtx, barcode.tsv, and feature.tsv)

(2)去除duplicated genes

(3)資料載入seurat

(4)計算mito gene的比例

(5)Cell QC

(6)Normalization

(7)Identify highly variable genes

createSeuratObjEachSample <- function(project.name, 
                                      dir,
                                      min.cells = 3,
                                      min.features = 500,
                                      max.features = 6000,
                                      assay = "RNA",
                                      max.mt.percentage = 20,
                                      num.hvf = 2000,
                                      mt.pattern = "^mt-",
                                      normalization.method = "LogNormalize",
                                      hvf.selection.method = "vst"){
  #### (1) Read files
  cat(c("Read files\n"))
  barcode.path <- file.path(dir, "barcodes.tsv")
  features.path <- file.path(dir, "genes.tsv")
  matrix.path <- file.path(dir, "matrix.mtx")
  mat <- Matrix::readMM(file = matrix.path)
  feature.names = read.delim(features.path,
                             header = FALSE,
                             stringsAsFactors = FALSE)
  barcode.names = read.delim(barcode.path,
                             header = FALSE,
                             stringsAsFactors = FALSE)
  colnames(mat) = paste(project.name, barcode.names$V1, sep = "_")
  rownames(mat) = feature.names$V2
  mat <- as.matrix(mat)
  cat(c("-> matrix:", dim(mat), "\n"))
  
  #### (2) 將基因名子一樣的資料加總起來
  cat(c("Remove duplicated genes\n"))
  IDfreqs <- table(rownames(mat))
  IDfreqs <- IDfreqs[IDfreqs > 1]
  cat(c("-> Num. dup. genes:", length(IDfreqs), "\n"))
  if (length(IDfreqs) > 0){
    for (i in names(IDfreqs)) {
      index <- which(rownames(mat) == i)
      duplicates <- as.matrix(mat[index, ])
      sums <- apply(duplicates, 2, sum, na.rm=TRUE)
      
      ### 置換成加總數值
      mat[index[1],] <- sums
      
      ### 將重複的資料移除
      for (j in length(index):2) {
        mat <- mat[-index[j], ]
      }
    }
  }
  cat(c("-> matrix:", dim(mat), "\n"))
  
  #### (3) 資料載入seurat
  cat(c("Create Seurat Object\n"))
  seurat.obj <- Seurat::CreateSeuratObject(counts = mat, 
                                           project = project.name,
                                           min.cells = min.cells, 
                                           min.features = min.features, 
                                           assay = assay, 
                                           meta.data = NULL)
  cat(c("-> dimension:", dim(seurat.obj), "\n"))
  
  #### (4) 計算mito gene的比例
  cat(c("Calculate MT percentage\n"))
  seurat.obj[["Mito_percent"]] <- Seurat::PercentageFeatureSet(seurat.obj, pattern = mt.pattern)
  
  #### (5) Cell QC
  cat(c("Cell QC\n"))
  min_nFeature <- min.features
  max_nFeature <- max.features
  max_Mito_percent <- max.mt.percentage
  seurat.obj <- seurat.obj %>%
    subset(., subset = nFeature_RNA > min_nFeature & nFeature_RNA < max_nFeature) %>%
    subset(., subset = Mito_percent < max_Mito_percent)
  cat(c("-> dimension:", dim(seurat.obj), "\n"))
  
  #### (6) Normalization
  cat(c("Normalization\n"))
  seurat.obj <- Seurat::NormalizeData(seurat.obj,
                                      normalization.method = normalization.method,
                                      scale.factor = 10000)
  
  #### (7) Identify highly variable genes
  cat(c("High variable genes\n"))
  seurat.obj <- Seurat::FindVariableFeatures(seurat.obj, 
                                             selection.method = hvf.selection.method, 
                                             nfeatures = num.hvf 
  )
  
  return(seurat.obj)
}
k_seurat <- createSeuratObjEachSample (project.nam = "k", 
                                       dir = file.path(data.dir, "K"),
                                       min.cells = 3,
                                       min.features = 500,
                                       max.features = 6000,
                                       assay = "RNA",
                                       max.mt.percentage = 20,
                                       num.hvf = 2000,
                                       mt.pattern = "^mt-",
                                       normalization.method = "LogNormalize",
                                       hvf.selection.method = "vst")
## Read files
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.0 GiB
## -> matrix: 20304 6696 
## Remove duplicated genes
## -> Num. dup. genes: 0 
## -> matrix: 20304 6696 
## Create Seurat Object
## Warning: Data is of class matrix. Coercing to dgCMatrix.
## -> dimension: 18738 6696 
## Calculate MT percentage
## Cell QC
## -> dimension: 18738 6696 
## Normalization
## Normalizing layer: counts
## High variable genes
## Finding variable features for layer counts
kl_seurat <- createSeuratObjEachSample (project.nam = "kl", 
                                        dir = file.path(data.dir, "KL"),
                                        min.cells = 3,
                                        min.features = 500,
                                        max.features = 6000,
                                        assay = "RNA",
                                        max.mt.percentage = 20,
                                        num.hvf = 2000,
                                        mt.pattern = "^mt-",
                                        normalization.method = "LogNormalize",
                                        hvf.selection.method = "vst")
## Read files
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.1 GiB
## -> matrix: 20304 7564 
## Remove duplicated genes
## -> Num. dup. genes: 0 
## -> matrix: 20304 7564 
## Create Seurat Object
## Warning: Data is of class matrix. Coercing to dgCMatrix.
## -> dimension: 19458 7564 
## Calculate MT percentage
## Cell QC
## -> dimension: 19458 7564 
## Normalization
## Normalizing layer: counts
## High variable genes
## Finding variable features for layer counts
# 觀察兩個Seurat object
VlnPlot(k_seurat, features = c("nFeature_RNA", "nCount_RNA", "Mito_percent"), ncol = 3)

VlnPlot(kl_seurat, features = c("nFeature_RNA", "nCount_RNA", "Mito_percent"), ncol = 3)

### 2. Data integration --------------------------------------------------------
# Create an ‘integrated’ data assay for downstream analysis
# Identify cell types that are present in both datasets
# Obtain cell type markers that are conserved in both control and stimulated cells
# Compare the datasets to find cell-type specific responses to stimulation
data.list <- list("k" = k_seurat, "kl" = kl_seurat)
# select features that are repeatedly variable across datasets for integration
features <- Seurat::SelectIntegrationFeatures(object.list = data.list, 
                                              nfeatures = 2000,
                                              verbose = FALSE)
# identify anchors 
anchors <- Seurat::FindIntegrationAnchors(object.list = data.list, 
                                          anchor.features = features,
                                          verbose = F)
# this command creates an 'integrated' data assay
combined.obj <- Seurat::IntegrateData(anchorset = anchors)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
DefaultAssay(combined.obj) <- "integrated"
dim(combined.obj)
## [1]  2000 14260
combined.obj@meta.data$orig.ident %>% table()
## .
##    k   kl 
## 6696 7564
combined.obj@meta.data %>% head()
##                    orig.ident nCount_RNA nFeature_RNA Mito_percent
## k_AAACCCAAGACAAGCC          k       5388         1842    3.0252413
## k_AAACCCAAGATCGACG          k       8021         2329    2.9173420
## k_AAACCCAAGGCTGTAG          k       7292         2613    2.6330225
## k_AAACCCACAATAGAGT          k       2706          912    0.4065041
## k_AAACCCACAGATTAAG          k       4832         1579    3.7251656
## k_AAACCCAGTATGGTAA          k       8292         2217    3.3043898
### 3. Run the standard workflow for visualization and clustering --------------
combined.obj <- Seurat::ScaleData(combined.obj, verbose = FALSE)
combined.obj <- Seurat::RunPCA(combined.obj, npcs = 50, verbose = FALSE)
ElbowPlot(combined.obj, ndims = 50)

pcs <- 20   # 肉眼判斷為20(符合paper作法)
# visualizing both cells and features that define the PCA
VizDimLoadings(combined.obj, dims = 1:2, reduction = "pca")

DimPlot(combined.obj, reduction = "pca") + NoLegend()

DimHeatmap(combined.obj, dims = 1, cells = 500, balanced = TRUE)

### 4. Run non-linear dimensional reduction (UMAP/tSNE)
combined.obj <- combined.obj %>%
  Seurat::RunUMAP(reduction = "pca", umap.method = "uwot", dims = 1:pcs, verbose = FALSE) %>%
  Seurat::RunTSNE(reduction="pca", dims= 1:pcs, verbose = FALSE) %>%
  Seurat::FindNeighbors(reduction = "pca", dims = 1:pcs, verbose = FALSE) %>%
  Seurat::FindClusters(resolution = 0.06, verbose = FALSE) %>%  # paper提供resolution = 0.06
  Seurat::FindSubCluster("0",resolution = 0.06, graph.name = "integrated_snn", 
                         subcluster.name = "new cluster")  # T cell進一步subcluster
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4739
## Number of edges: 192814
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9576
## Number of communities: 2
## Elapsed time: 0 seconds
# 觀察cluster狀況
Seurat::DimPlot(combined.obj, reduction="umap")  

Seurat::DimPlot(combined.obj, reduction="tsne")

Seurat::DimPlot(combined.obj, reduction="tsne", group.by = "orig.ident")

Seurat::DimPlot(combined.obj, reduction="tsne", group.by = "new cluster")

# 觀察Feature狀況
FeaturePlot(combined.obj, reduction="umap", features = "Mito_percent")

### 5. Find markers for each cluster -------------------------------------------
markers <- FindAllMarkers(combined.obj, only.pos = TRUE) %>% 
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1) %>% 
  dplyr::arrange(desc(avg_log2FC)) %>% 
  slice_head(n = 10) %>%
  ungroup() -> top10
## Calculating cluster 0
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 1
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 2
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 3
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 4
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.2, drop = FALSE]): NaNs produced
## Calculating cluster 5
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 6
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 7
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 8
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 9
# 將找到的marker進行視覺化,作heatmap
Seurat::DoHeatmap(combined.obj, group.by = "ident", features = top10$gene,draw.lines = TRUE) + NoLegend()

head(markers)
## # A tibble: 6 × 7
##       p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##       <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
## 1 8.33e- 63       5.01 0.155 0.061 1.67e- 59 0       Il17f  
## 2 4.75e-180       4.99 0.219 0.041 9.49e-177 0       Cd163l1
## 3 0               4.71 0.465 0.135 0         0       Dapl1  
## 4 1.75e-127       4.68 0.134 0.036 3.49e-124 0       Trgv2  
## 5 0               4.63 0.121 0.002 0         0       Trav4-2
## 6 3.39e- 62       4.46 0.186 0.091 6.78e- 59 0       Tcrg-C1
# 可以將特定gene作圖
FeaturePlot(combined.obj, reduction="umap", features = "Cd19", label = T)
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead

VlnPlot(combined.obj, features = c("Cd19", "Cd3e"))
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

### 6.annotation(from paper) -----------------------------------------------------
b_cell_mark <- c("Cd19","Cd22","Cd79a","Cd79b")
cancer_mark <- c("Epcam","Krt18","Krt8","Krt19")
dendritic_mark <-  c("Cd86","Cd83","Clec10a","Ccl22","Cxcl16","Xcr1","Rab7b","Naaa","Btla")
endo_mark <-  c("Pecam1","Tek","Plvap","Thbd","Tmem100","Adgrf5","Podxl","Nostrin","Acvrl1")
macrophage_mark <- c("Adgre1","Itgam","Itgal","Cd14","Ccl9","F13a1","Cx3cr1","Cd68")
neutrophil_mark <- c("Csf3r","Cxcr2S100a8")
nk_mark <- c("Klrd1","Nkg7","Prf1","Gzma","Gzmb","Ccl5","Cma1","Klre1","Klra4")
plasma_dend_mark <- c("Ccr9","Bst2")
t_cell_mark <- c("Cd3d","Cd5","Cd3e")
# 觀察cell cluster marker是否能夠代表該cluster
FeaturePlot(combined.obj, reduction="umap", features = b_cell_mark[1:4], label = T)
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Cd22 in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, reduction="umap", features = cancer_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = dendritic_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = endo_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = macrophage_mark[1:4], label = T)
## Warning: Could not find Itgal in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, reduction="umap", features = neutrophil_mark, label = T)
## Warning: The following requested variables were not found: Cxcr2S100a8

FeaturePlot(combined.obj, reduction="umap", features = nk_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = plasma_dend_mark, label = T)

# T cell
p1 <- FeaturePlot(combined.obj, reduction="umap", features = "Cd3e", label = T)  # T cell
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
p2 <- FeaturePlot(combined.obj, reduction="umap", features = "Tcf7", label = T)  # activated T cell
## Warning: Could not find Tcf7 in the default search locations, found in 'RNA'
## assay instead
p3 <- FeaturePlot(combined.obj, reduction="umap", features = "Ctla4", label = T) # exhausted T cell
cowplot::plot_grid(p1, p2, p3, ncol = 3)

VlnPlot(combined.obj, features = c("Cd3e","Tcf7","Ctla4"))
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Tcf7 in the default search locations, found in 'RNA'
## assay instead

# cluster annotation
combined.obj$`new cluster`[combined.obj$`new cluster` == "0_0"] <- "T cell activated"
combined.obj$`new cluster`[combined.obj$`new cluster` == "0_1"] <- "T cell exhausted"
combined.obj$`new cluster`[combined.obj$`new cluster` == "1"] <- "Endothelial cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "2"] <- "Macrophage"
combined.obj$`new cluster`[combined.obj$`new cluster` == "3"] <- "B cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "4"] <- "Neutrophils"
combined.obj$`new cluster`[combined.obj$`new cluster` == "5"] <- "dendritic cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "6"] <- "NK cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "7"] <- "cancer cell1"
combined.obj$`new cluster`[combined.obj$`new cluster` == "8"] <- "cancer cell2"
combined.obj$`new cluster`[combined.obj$`new cluster` == "9"] <- "Plasmacytoid dendritic cell"

combined.obj <- SetIdent(combined.obj, value = combined.obj@meta.data$`new cluster`)
Seurat::DimPlot(combined.obj, reduction="umap", label = T)  

# visualization
features <- c("Cd3e","Ctla4")
# Ridge plots - from ggridges. Visualize single cell expression distributions in each cluster
RidgePlot(combined.obj, features = features, ncol = 2)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
## Picking joint bandwidth of 0.0237
## Picking joint bandwidth of 0.022

# Violin plot - Visualize single cell expression distributions in each cluster
VlnPlot(combined.obj, features = features, ncol = 2)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

VlnPlot(combined.obj, features = features, split.by = "orig.ident")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Feature plot - visualize feature expression in low-dimensional space
FeaturePlot(combined.obj, features = features)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, features = features, reduction = "umap",blend = TRUE)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, features = features, split.by = "orig.ident")
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Dot plots - the size of the dot corresponds to the percentage of cells expressing the
# feature in each cluster. The color represents the average expression level
DotPlot(combined.obj, features = features) + RotatedAxis()
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Single cell heatmap of feature expression
DoHeatmap(subset(combined.obj, downsample = 100), features = top10$gene, size = 3) + NoLegend()

# Identify conserved cell type markers
sg <- list(bcell = c("Cd19","Cd22","Cd79a","Cd79b"),
           cancer = c("Epcam","Krt18","Krt8","Krt19"),
           dendritic = c("Cd86","Cd83","Clec10a","Ccl22","Cxcl16","Xcr1","Rab7b","Naaa","Btla"),
           endo = c("Pecam1","Tek","Plvap","Thbd","Tmem100","Adgrf5","Podxl","Nostrin","Acvrl1"),
           macrophage = c("Adgre1","Itgam","Itgal","Cd14","Ccl9","F13a1","Cx3cr1","Cd68"),
           neutro= c("Csf3r","Cxcr2S100a8"),
           nkcell = c("Klrd1","Nkg7","Prf1","Gzma","Gzmb","Ccl5","Cma1","Klre1","Klra4"),
           plasma_dend = c("Ccr9","Bst2"),
           tcell = c("Cd3d","Cd5","Cd3e"))
DotPlot(combined.obj, features = sg, cols = c("blue", "red"), dot.scale = 4, assay = "RNA") +
  RotatedAxis()
## Warning: The following requested variables were not found: Cxcr2S100a8
## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.